# -*- coding:utf8 -*-
# !/usr/bin/env python

"""
#全国企业信用信息公示系统（河北）
#维护黄羽
"""

import re
from scpy.logger import get_logger
import copy
import sys
import requests
from utils import kill_captcha
from scpy.request_util import *
import hashlib
from table import index, report_index, table_clean, parse_time
import sd_trans_dict as TR
import sd_template_dict as TE
import sd_format as FO
import time
import traceback
import json
import time

# Python 2 hack: reload(sys) restores setdefaultencoding (hidden by site.py)
# so implicit str<->unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')
logger = get_logger(__file__)

# Desktop Chrome User-Agent string sent with every request in this module.
ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'


def get_ent(companyName):
    """
    Search the Hebei enterprise credit site for a company and return the
    URL of its detail page.

    :param companyName: company name or registration number to search for
    :return: detail-page URL string, or None when no result matched
    :raises ValueError: when the session token cannot be extracted
    """
    req = requests.session()
    # try/finally so the session is also released when the token lookup
    # raises (the original only closed it on the success path).
    try:
        index_url = 'http://www.hebscztxyxx.gov.cn/notice/'
        req.headers = {
            'Connection': 'keep-alive',
            'User-Agent': ua,
        }

        index_res = req.get(url=index_url).content
        # The search form requires a per-session CSRF-style token.
        res_token = re.findall(r'name="session\.token".*?value="(.*?)"', index_res)
        if res_token:
            res_token = res_token[0]
        else:
            raise ValueError("session.token error")

        captcha_url = 'http://www.hebscztxyxx.gov.cn/notice/captcha?preset=&ra=0.17911669868044555'

        req.headers = {
            'Accept': 'image/webp,image/*,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'www.hebscztxyxx.gov.cn',
            'Referer': 'http://www.hebscztxyxx.gov.cn/notice/search/popup_captcha',
            'User-Agent': ua,
        }

        # Fetch the captcha image so the server binds one to this session;
        # the response body itself is not needed.
        req.get(url=captcha_url).content

        verify_url = 'http://www.hebscztxyxx.gov.cn/notice/security/verify_captcha'
        verify_data = {
            'captcha': '12',  # presumably any value passes here - TODO confirm
            'session.token': res_token,
        }

        req.post(url=verify_url, data=verify_data).content

        ent_info_list_url = 'http://www.hebscztxyxx.gov.cn/notice/search/ent_info_list'

        ent_info_list_data = {
            'searchType': '1',
            'captcha': '12',
            'session.token': res_token,
            'condition.keyword': companyName,
        }

        ent_info_list_res = req.post(url=ent_info_list_url, data=ent_info_list_data).content
    finally:
        req.close()

    # First result link in the hit list, if any.
    ent_url = re.findall('class="link"><a href="(.*?)"', ent_info_list_res)
    if ent_url:
        return ent_url[0]
    else:
        return None


def get_raw_html(ent_url):
    """
    Download the raw HTML of a company detail page plus all of its
    annual-report pages.

    :param ent_url: detail-page URL returned by get_ent(); may be falsy
    :return: dict with keys province/type/html/yearList/keyword/companyName/
             json, or None when ent_url is empty
    """
    if not ent_url:
        return None

    raw_html_dict = {
        'province': 'heb',
        'type': '0',
        'html': '',
        'yearList': [],
        'keyword': '',
        'companyName': '',
        'json': '',
    }

    req = requests.session()
    # try/finally fixes the original's leak: the session was never closed.
    try:
        req.headers = {
            'User-Agent': ua,
        }
        raw_html_dict['html'] = req.get(url=ent_url).content
        logger.info("获取基本信息结束！")

        # Tab 02 of the same page lists the annual reports.
        year_list_url = ent_url.replace('tab=01', 'tab=02')
        year_list_page = req.get(url=year_list_url).content
        report_url_tuple = re.findall(r'<a href="(.*?)".*?(\d+?)年度报告', year_list_page)

        raw_html_report_list = []
        for report_url, year in report_url_tuple:
            if not report_url or not year:
                # Layout changed; skip the row rather than abort the crawl.
                continue
            logger.info("获取年报！")
            raw_html_report_list.append({year: req.get(url=report_url).content})

        raw_html_dict['yearList'] = raw_html_report_list
    finally:
        req.close()

    return raw_html_dict


def extract_share_holder_list(share_holder_table):
    """
    Parse the shareholder-information table.

    :param share_holder_table: HTML fragment of the shareholder table
    :return: list of shareholder dicts (possibly empty)
    :raises Exception: when a row no longer matches the expected layout
    """
    share_holder_list = []
    html = re.sub('<tr.*?>', '<tr>', share_holder_table)
    html = html.replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '')
    detail = re.findall('<tr>.*?</tr>', html)
    if not detail or len(detail) <= 3:
        return []

    # Rows 0-1 are header rows; the last row is the table footer.
    for item in detail[2:-1]:
        # Normalize every <td ...> opening tag to a bare <td>, preserving
        # already-bare tags via the @@@@@ placeholder.
        row = item.replace('<td>', '@@@@@')
        row = re.sub('<td.+?>', '<td>', row)
        row = row.replace('@@@@@', '<td>')
        row = row.replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '')
        detail_td = re.findall("<td>(.*?)</td>", row)
        if not detail_td or len(detail_td) < 4:
            raise Exception("添加网页解析方法！")

        dic1 = {"shareholderType": detail_td[0], "shareholderName": detail_td[1],
                "regCapCur": "", "country": "", "fundedRatio": "", "subConam": "", "conDate": ""}
        # BUG FIX: the original indexed detail_td[4] whenever len >= 4 and
        # crashed with IndexError on 4-column rows; guard the access.
        # (Spaces were stripped above, so '<a href' matches 'ahref'.)
        req_url = re.findall('ahref="(.*?)"', detail_td[4]) if len(detail_td) > 4 else []
        if req_url:
            req = requests.session()
            req.headers = {'User-Agent': ua, }
            try:
                share_res = req.get(url=req_url[0]).content
            finally:
                req.close()

            # Subscribed capital. The target page is rendered by JS, so the
            # subscribed and paid-in amounts cannot be merged here.
            invt_item = re.findall(
                '''invt\.subConAm = "(.*?)".*?invt\.conDate = '(.*?)'.*?invt\.conForm = "(.*?)"''', share_res,
                re.S)
            for invt in invt_item:
                if invt and len(invt):
                    invt_dict = copy.deepcopy(dic1)
                    invt_dict["subConam"] = float(invt[0])
                    invt_dict["conDate"] = parse_time(invt[1])
                    invt_dict["regCapCur"] = invt[2]
                    share_holder_list.append(invt_dict)

        # Append the bare row only when no entry with the same shareholder
        # name was produced above (de-duplication by name).
        if not any(s['shareholderName'] == detail_td[1] for s in share_holder_list):
            share_holder_list.append(dic1)

    return share_holder_list


def extract_base(html_res):
    """
    Parse the registration base-information page into a result dict.

    :param html_res: HTML source of the company detail page
    :return: parsed dict (keys always present, lists default to empty),
             or None when html_res is empty
    """
    if not html_res:
        return None

    res_dict = {
        'province': 'heb',
        'basicList': [],
        'shareHolderList': [],
        'personList': [],
        'punishBreakList': [],
        'alidebtList': [],
        'entinvItemList': [],
        'frinvList': [],
        'frPositionList': [],
        'alterList': [],
        'filiationList': [],
        'caseInfoList': [],
        'sharesFrostList': [],
        'sharesImpawnList': [],
        'morDetailList': [],
        'morguaInfoList': [],
        'liquidationList': [],
        'checkMessage': [],
        "abnormalOperation": [],
    }

    # Plain sections: (section label on the page, key in the result dict).
    # All of them go through the same table_clean/index pipeline.
    sections = [
        ("基本信息", 'basicList'),
        ("变更信息", 'alterList'),
        ("主要人员信息", 'personList'),
        ("分支机构信息", 'filiationList'),
        ("清算信息", 'liquidationList'),
        ("抽查检查信息", 'checkMessage'),
    ]
    for label, key in sections:
        table = table_clean(html_res, label)
        if table:
            res_dict[key] = index(label, table)

    # Shareholders need the dedicated parser (it follows detail links).
    share_holder_table = table_clean(html_res, "股东信息")
    if share_holder_table:
        res_dict['shareHolderList'] = extract_share_holder_list(share_holder_table)

    # Abnormal-operation rows embed markup in 'recause'; strip the tags.
    abnormal_operation_table = table_clean(html_res, "经营异常信息")
    if abnormal_operation_table:
        abnormal_operation_list = index("经营异常信息", abnormal_operation_table)
        for item in abnormal_operation_list:
            recause = item.get('recause', None)
            item['recause'] = re.sub('<.+?>', ' ', recause) if recause else ''
        res_dict['abnormalOperation'] = abnormal_operation_list

    return res_dict


def extract_year_report(year_report_html_list):
    """
    Parse annual-report pages into a list of report dicts.

    :param year_report_html_list: list of one-entry {year: html} dicts
    :return: list of parsed report dicts, one per year (may be empty)
    """
    if not year_report_html_list:
        return []

    # Uniform sections: (section label on the page, key in the report dict).
    sections = [
        ('网站或网店信息', 'website'),
        ('企业资产状况信息', 'assetsInfo'),
        ('股东及出资信息', 'investorInformations'),
        ('股权变更信息', 'equityChangeInformations'),
        ('修改记录', 'changeRecords'),
    ]

    year_report_list = []
    for item in year_report_html_list:
        for year, html in item.items():
            report_dict = {
                'baseInfo': {},
                'website': {},
                'investorInformations': [],
                'assetsInfo': {},
                'equityChangeInformations': [],
                'changeRecords': [],
                'year': year,
            }

            # The base-info section appears under either of two headings;
            # only one of the two table_clean calls yields content.
            report_basic_table = table_clean(html, '企业基本信息') + table_clean(html, "基本信息")
            if report_basic_table:
                report_dict['baseInfo'] = report_index('企业基本信息', report_basic_table)

            for label, key in sections:
                table = table_clean(html, label)
                if table:
                    report_dict[key] = report_index(label, table)

            year_report_list.append(report_dict)

    return year_report_list


def search2(companyName):
    """
    Crawl a company and merge its registration info with its annual reports.

    :param companyName: company name or registration number
    :return: None when the company is missing (or the site errored),
             otherwise a (raw_html_dict, res_dict, enter_method) tuple
    :raises Exception: when no base page came back (site layout changed)
    """
    logger.info("开始获取网页！")
    ent_url = get_ent(companyName)
    if not ent_url:
        logger.info("公司不存在！")
        return None  # company not found
    raw_html_dict = get_raw_html(ent_url)
    if not raw_html_dict:
        logger.info("公司不存在, 或者目标网站存在错误！")
        return None  # company missing, or target-site error
    logger.info("获取网页完毕,开始解析！")
    raw_base = raw_html_dict.get('html', None)

    if not raw_base:
        raise Exception("网站发生变化")
    # Base registration info.
    res_dict = extract_base(raw_base)
    # Annual reports (a company may exist with no reports).
    year_report_html_list = raw_html_dict.get('yearList', [])
    res_dict['yearReportList'] = extract_year_report(year_report_html_list) if year_report_html_list else []

    # Consolidate.
    raw_html_dict['keyword'] = companyName
    basic_list = res_dict.get('basicList') or []
    # Guard against an empty basicList (original indexed [0] unconditionally
    # and raised IndexError when the base table failed to parse).
    raw_html_dict['companyName'] = basic_list[0].get('enterpriseName', '') if basic_list else ''

    # Entry descriptor so search3() can re-crawl later without searching.
    enter_method = {
        'companyName': raw_html_dict['companyName'],
        'url': ent_url,
        'method': 'get',
        'data': {},
        'province': 'heb',
    }
    logger.info("解析完毕！")
    return raw_html_dict, res_dict, enter_method


def search(companyName):
    """
    Entry point for the realtime crawler: return only the parsed info dict.

    :param companyName: company name or registration number
    :return: None when the company is missing, otherwise the parsed dict
    :raises Exception: when search2 returns an unexpected shape
    """
    result = search2(companyName)
    if result is None:
        return None
    # BUG FIX: search2 returns a 3-tuple (raw_html_dict, res_dict,
    # enter_method); the old `len(result) == 2` check could never match,
    # so this function raised for every existing company.
    if isinstance(result, tuple) and len(result) >= 2:
        return result[1]
    raise Exception("错误!")


def search3(gate_method):
    """
    Re-crawl a company from a previously saved entry descriptor.

    :param gate_method: dict with 'url' (detail page) and optional 'companyName'
    :return: None when the page is missing, otherwise a
             (raw_html_dict, res_dict, enter_method) tuple
    :raises Exception: when no base page came back (site layout changed)
    """
    ent_url = gate_method.get('url')
    company_name = gate_method.get('companyName', '')
    raw_html_dict = get_raw_html(ent_url)
    if not raw_html_dict:
        logger.info("公司不存在, 或者目标网站存在错误！")
        return None  # company missing, or target-site error
    logger.info("获取网页完毕,开始解析！")
    raw_base = raw_html_dict.get('html', None)

    if not raw_base:
        raise Exception("网站发生变化")
    # Base registration info.
    res_dict = extract_base(raw_base)
    # Annual reports (a company may exist with no reports).
    year_report_html_list = raw_html_dict.get('yearList', [])
    res_dict['yearReportList'] = extract_year_report(year_report_html_list) if year_report_html_list else []

    # Prefer the freshly parsed name; fall back to the descriptor's name.
    # Guard against an empty basicList (original indexed [0] unconditionally).
    basic_list = res_dict.get('basicList') or []
    if basic_list:
        company_name = basic_list[0].get('enterpriseName', '') or company_name
    raw_html_dict['companyName'] = company_name

    # Refreshed entry descriptor.
    enter_method = {
        'companyName': company_name,
        'url': ent_url,
        'method': 'get',
        'data': {},
        'province': 'heb',
    }
    logger.info("解析完毕！")
    return raw_html_dict, res_dict, enter_method


if __name__ == "__main__":
    companyName = '河北恒利集团有限公司'

    # ent_url = get_ent(companyName)
    # get_raw_html(ent_url)

    res = search2(companyName)
    import json

    print json.dumps(res, indent=4, ensure_ascii=False)
